import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as pl
import scanpy.api as sc
import pandas as pd
import matplotlib.pyplot as plt
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=100, color_map='viridis') # low dpi (dots per inch) yields small inline figures
sc.logging.print_versions()
#Set working directory
wd = '/path/to/working/directory/'
results_file = './write/Embryo10Xv6_genes_PCAbatchCorrected.h5ad'
results_fileMerged = './write/Embryo10Xv6_genes_PCAbatchCorrected_mergedAGA.h5ad'
##====== Read normalised counts ======##
filename_data = wd + 'data/countsData_norm.mtx'
filename_gene_names = wd + 'data/genes.txt'
filename_barcodes = wd + 'data/cells.txt'
#filename_clustersAll = wd + 'data/cellTypes_20180215.txt'
print('reading counts')
adata_all_cells = sc.read(filename_data, cache=True).transpose()
print('reading genes')
adata_all_cells.var_names = np.genfromtxt(filename_gene_names, dtype='str')
print('reading cells')
adata_all_cells.obs_names = np.genfromtxt(filename_barcodes, dtype='str')
##====== Substitute gene IDs for geneNames ======##
genes = np.genfromtxt(filename_gene_names, dtype='str')
geneNameFilename = wd + 'data/rawData/genes.tsv'
geneName = pd.read_table(geneNameFilename,delimiter="\t",header=None,names=["geneID","geneName"])
geneName = geneName.set_index(geneName.geneID)
#print(metadata[adata.smp_names,:])
geneNameData = geneName.loc[genes,'geneName']
adata_all_cells.var_names = geneNameData
adata_all_cells.var['geneID'] = genes
print(adata_all_cells.var_names)
##====== Check number of total cells -rows- ======##
adata_all_cells.shape
##====== Read metadata file and set index to cell ======##
metadataFilename = wd + 'data/rawData/meta.tab'
meta = pd.read_table(metadataFilename,delimiter="\t",header=0)
meta = meta.set_index(meta.cell)
##====== Check number of total cells in metadata ======##
meta.shape
##====== Filter metadata so that the cells correspond to those in the counts matrix ======##
metadata00 = pd.DataFrame(np.array(meta)[np.array([cell in adata_all_cells.obs_names for cell in meta['cell']]),:])
metadata00.columns = meta.columns
metadata00 = metadata00.set_index(metadata00.cell)
metadata00.shape
##====== Check columns in metadata ======##
metadata00.columns
##====== Add metadata to adata ======##
adata_all_cells.obs['sample'] = list(metadata00.loc[:,'sample'])
adata_all_cells.obs['stage'] = list(metadata00.loc[:,'stage'])
adata_all_cells.obs['batch'] = list(metadata00.loc[:,'batch'])
adata_all_cells.obs['theiler'] = list(metadata00.loc[:,'theiler'])
adata_all_cells.obs['doub.density'] = list(metadata00.loc[:,'doub.density'])
adata_all_cells.obs['doublet'] = list(metadata00.loc[:,'doublet'])
adata_all_cells.obs['cluster'] = list(metadata00.loc[:,'cluster'])
adata_all_cells.obs['clustersub'] = list(metadata00.loc[:,'cluster.sub'])
adata_all_cells.obs['clusterstage'] = list(metadata00.loc[:,'cluster.stage'])
adata_all_cells.obs['clustertheiler'] = list(metadata00.loc[:,'cluster.theiler'])
adata_all_cells.obs['stripped'] = list(metadata00.loc[:,'stripped'])
adata_all_cells.obs['clusterIdx'] = list(metadata00.loc[:,'cluster']-1)
adata_all_cells.obs['clustercat'] = ['cluster'+ str(name) for name in list(metadata00.loc[:,'cluster'])]
##====== Read batch corrected PCA ======##
PCAfilename = wd + 'data/rawData/PCAall_20180830.mtx'
filename_cells = wd + 'data/rawData/PCAall_cellnames_2018830.tab'
#filename_clustersAll = wd + 'data/cellTypes_20180215.txt'
print('reading counts')
adataPCA = sc.read(PCAfilename, cache=True).transpose()
#print('reading genes')
#adata_all_genes.var_names = np.genfromtxt(filename_gene_names, dtype='str')
print('reading cells')
adataPCA.obs_names = np.genfromtxt(filename_cells, dtype='str')
adataPCA.shape
##====== Filter those cells that are in PCA - these will be the passed-QC cells ======##
adata = adata_all_cells[np.array([cell in adataPCA.obs_names for cell in adata_all_cells.obs_names]),:]
##====== Check how many cells passed QC ======##
adata.shape
##====== Add batch corrected PCA to adata object ======##
adata.obsm['X_pca'] = adataPCA.X.todense()
del adataPCA
del adata_all_cells
##====== Filter out stripped nuclei - they should be already filtered out but just in case ======##
metadata0 = pd.DataFrame(np.array(meta)[np.array([cell == False for cell in meta['stripped']]),:])
metadata0.columns = meta.columns
metadata0 = metadata0.set_index(metadata0.cell)
#metadata0
##====== Filter out doublets - they should be already filtered out but just in case ======##
metadata1 = pd.DataFrame(np.array(metadata0)[np.array([cell == False for cell in metadata0['doublet']]),:])
metadata1.columns = metadata0.columns
metadata1 = metadata1.set_index(metadata1.cell)
#metadata1
##====== Filter out these cells in adata - you should see they are the same dimensions ======##
adata = adata[np.array([cell in metadata0['cell'] for cell in adata.obs_names]),:]
adata.shape
##====== Make cluster names of the clustersub column unique ======##
stringClust = list()
for i in range(0,len(adata.obs['cluster'])):
stringClust.append(str(int(adata.obs['cluster'][i]))+str(int(adata.obs['clustersub'][i])))
adata.obs['clusterSubUnique'] = stringClust
##====== Read tSNE file ======##
tSNEfilename = wd + 'data/blanca_tsne.tab'
tSNE = pd.read_table(tSNEfilename,delimiter="\t",header=None,names=["tsne1","tsne2","cluster","celltype"])
##====== Add tSNE to adata ======##
adata.obs['tsne1'] = list(tSNE['tsne1'])
adata.obs['tsne2'] = list(tSNE['tsne2'])
adata.obs['clustertsne'] = list(tSNE['cluster'])
adata.obs['celltype'] = list(tSNE['celltype'])
tsne_x = [float(i) for i in tSNE.loc[:,'tsne1']]
tsne_y = [float(i) for i in tSNE.loc[:,'tsne2']]
coords_matrix = np.array([tsne_x, tsne_y])
print(coords_matrix.T.shape)
#Add coordinates in to adata file
adata.obsm['X_tsne'] = np.array(coords_matrix.T)
##====== Read tSNE file ======##
UMAPfilename = wd + 'data/umap.tab'
UMAP = pd.read_table(UMAPfilename,delimiter="\t",header=None,names=["umap1","umap2"])
UMAP_x = [float(i) for i in UMAP.loc[:,'umap1']]
UMAP_y = [float(i) for i in UMAP.loc[:,'umap2']]
coords_matrix = np.array([UMAP_x, UMAP_y])
print(coords_matrix.T.shape)
#Add coordinates in to adata file
adata.obsm['X_umap'] = np.array(coords_matrix.T)
##====== Check cell types ======##
np.unique(adata.obs['celltype'])
##====== Write results ======##
sc.write(results_file,adata)
##====== Read adata object ======##
adata = sc.read(results_file)
##====== Create colour palette for cell types ======##
all_colours = {
"Allantois" : "#532C8A",#[32] "Allantois"
"Cardiac mesenchyme" : "#F7901D",#[23] "Cardiac mesenchyme"
"Cardiomyocytes" : "#B51D8D",#[34] "Cardiomyocytes"
"Def. endoderm" : "#F397C0",#[24] "Def. endoderm"
"Early mixed mesoderm" : "#C594BF",#[7] "Early mixed mesoderm"
"Early posterior mesoderm" : "#DFCDE4",#[26] "Early ExE mesoderm"
"Early neurectoderm" : "#A0CC47",#[10] "Early neurectoderm"
"Early paraxial mesoderm" : "#3F84AA",#[17] "Early paraxial mesoderm"
"Endothelium" : "#B3793B",#[20] "Endothelium"
"Epiblast" : "#683612",#[1] "Epiblast"
"Erythroid 1" : "#C72228",#[15] "Erythroid 1"
"Erythroid 2" : "#EF4E22",#[37] "Erythroid2"
"ExE ectoderm 1" : "#989898",#[30] "ExE ectoderm 1"
"ExE ectoderm 2" : "#333333",#[4] "ExE ectoderm 2"
"ExE endoderm" : "#7F6874",#[5] "ExE endoderm"
"ExE mesoderm" : "#7253A2",#[12] "ExE mesoderm"
"Forebrain" : "#65A83E",#[8] "Forebrain"
"Foregut" : "#EF5A9D",#[19] "Foregut"
"Haemato-endothelial progenitors" : "#FBBE92",#[9] "Hemato-endothelial progenitors"
"Intermediate mesoderm" : "#139992",#[31] "Intermediate mesoderm"
"Pharyngeal mesoderm" : "#C9EBFB",#[13] "Late mixed mesoderm"
"Late paraxial mesoderm" : "#8DB5CE",#[33] "Late paraxial mesoderm (presomitic mesoderm)"
"Midgut/Hindgut": "#CE4E82",#[35] "Midgut/Hindgut"
"Midbrain/Hindbrain" : "#354E23",#[11] "Midbrain/Hindbrain"
"Trunk neural crest" : "#77783C",#[18] "Neural crest"
"NMP" : "#8EC792",#[14] "NMPs"
"Notochord" : "#0F4A9C",#[21] "Notochord"
"PGC" : "#FACB12",#[25] "PGC"
"Placodes" : "#BBDCA8",#[27] "Placodes"
"Parietal endoderm" : "#1A1A1A",#[29] "Parietal endoderm"
"Cranial neural crest" : "#C3C388",#[36] "Pre-migratory neural crest"
"Primitive Streak" : "#DABE99",#[2] "Primitive Streak"
"Somites" : "#005579",#[16] "Somites"
"Spinal cord" : "#CDE088",#[38] "Spinal cord"
"Surface ectoderm" : "#FFF574",#[22] "Surface ectoderm"
"Visceral endoderm" : "#F6BFCB",#[3] "Visceral endoderm"
"New": "gray"
}#light green
colPalette = [all_colours[i] for i in sorted(np.unique(adata.obs['celltype']))]
##====== Create colour palettes for stages ======##
bluePal = [ "#E3FCFA","#C1ECEF", "#A3D4E3", "#86B8D6","#6C98CA","#5476BE","#3E52B1","#2B2DA5","#2B1999"]
spectralPal = [ "#D53E4F","#F46D43","#FDAE61","#FFFFBF","#FEE08B","#E6F598","#ABDDA4","#3288BD", "#66C2A5","#A9A9A9"]
##====== Create colour palette for gene expression profiles ======##
from matplotlib.colors import LinearSegmentedColormap
rmap = LinearSegmentedColormap.from_list(name='gene_cmap',
colors=['lightgrey', 'thistle', 'red', 'darkred'])
cmap = LinearSegmentedColormap.from_list(name='gene_cmap',
colors=["#BFBFBF","#6495ED","#000000"])
##====== Read adata object ======##
adata = sc.read(results_file)
##====== Plot tSNE coloured by cell types ======##
sc.pl.tsne(adata,color='celltype',palette=colPalette,save="tsne_celltype.png")
##====== Plot tSNE coloured by time-points ======##
sc.pl.tsne(adata,color='stage',palette=spectralPal,save="tsne_timepoint.png")
##====== Plot UMAP coloured by cell types ======##
sc.pl.scatter(adata,color='celltype',basis="umap",palette=colPalette,save="umap_celltype.png")
##====== Plot UMAP coloured by time-points ======##
sc.pl.scatter(adata,color='stage',basis="umap",palette=spectralPal,save="umap_timepoint.png")
##====== Read adata object ======##
adata = sc.read(results_file)
##====== Check number of unique clusters ======##
len(np.unique(adata.obs['clusterSubUnique']))
##====== Compute diffusion maps using the batch corrected PCA space ======##
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, use_rep='X_pca',
knn=True, random_state=0, method='umap', metric='euclidean', metric_kwds={}, copy=False)
sc.tl.diffmap(adata)
##====== Compute PAGA using the diffusion maps space ======##
sc.pp.neighbors(adata,n_neighbors=15,use_rep='X_diffmap')
sc.tl.paga(adata, groups='clusterSubUnique')
sc.pl.paga_compare(adata, basis='tsne',threshold_solid=0.1,edge_width_scale=0.1, save="tsne_clustersSubUnique_AGA.png")
sc.pl.paga_compare(adata, basis='umap',threshold_solid=0.1,edge_width_scale=0.1, save="umap_clustersSubUnique_AGA.png")
##====== Check how connected the subclusters within each cluster are ======##
fig = plt.figure(figsize=(15,10))
clusters = np.unique(adata.obs["cluster"])
for j in range(0,len(np.unique(adata.obs['cluster']))):
number=clusters[j]
adataSub = adata[np.array([clust == number for clust in adata.obs['cluster']]),:]
#print("adataSub shape")
#print(adataSub.shape)
mat = adata.uns['paga']['connectivities'].todense()
#print("mat shape")
#print(mat.shape)
#print(labelsPlot0)
labelsPlot = number
clusterSubUn = np.unique(adataSub.obs['clusterSubUnique'])
#print(len(clusterSubUn))
clustBool = [clust in clusterSubUn for clust in np.unique(adata.obs['clusterSubUnique'])]
#print(clustBool)
# print(mat.shape)
# print(len(clustBool))
matsub = mat[clustBool,:][:,clustBool]
plt.subplot(5,5,j+1)
#print(matsub.shape)
plt.imshow(matsub, cmap='Reds', interpolation='nearest',vmin=0,vmax=1)
plt.title('cluster'+str(labelsPlot))
plt.colorbar()
plt.grid(True)
#print(matsub)
plt.subplots_adjust(
left = 0.125, # the left side of the subplots of the figure
right = 0.9, # the right side of the subplots of the figure
bottom = 0.1, # the bottom of the subplots of the figure
top = 0.9, # the top of the subplots of the figure
wspace = 0.5, # the amount of width reserved for blank space between subplots
hspace = 0.6 ) # the amount of height reserved for white space between subplots
fig.show()
fig.savefig('./figures/AdjacencyMatrices_subclusters.pdf', bbox_inches='tight')
##====== Merge and separate subclusters setting a threshold where dendrograms should split ======##
##====== As a positive control of something that needs to be separated, use the PGCs ======##
clusters = np.unique(adata.obs["cluster"])
dictNewClust = {}
fig2 = plt.figure(figsize=(20,15))
from scipy import cluster
from scipy.cluster.hierarchy import dendrogram, linkage
#adata = sc.read(results_file2)
thres = 1.6
def getMaxIdx(data):
np.arg.max(data)
sepClust = list()
for j in range(0,len(np.unique(adata.obs['cluster']))):
number=clusters[j]
adataSub = adata[np.array([clust == number for clust in adata.obs['cluster']]),:]
#print("adataSub shape")
#print(adataSub.shape)
mat = adata.uns['paga']['connectivities'].todense()
#print("mat shape")
#print(mat.shape)
labelsPlot = number
clusterSubUn = np.unique(adataSub.obs['clusterSubUnique'])
# print(clusterSubUn)
clustBool = [clust in clusterSubUn for clust in np.unique(adata.obs['clusterSubUnique'])]
#print(clustBool)
# print(mat.shape)
# print(len(clustBool))
matsub = mat[clustBool,:][:,clustBool]
matdist = 1-matsub
np.fill_diagonal(matdist,0)
#print(lk)
lk = linkage(matdist, "ward")
cutree = cluster.hierarchy.cut_tree(lk, height=thres)
# print(cutree)
for q in range(0,len(list(cutree))):
name= 'cluster'+ str(clusterSubUn[q])
dictNewClust[name] = cutree[q]
#print(cutree[q])
# print(dictNewClust)
plt.subplot(5,5,j+1)
dendrogram(lk,color_threshold=thres)
plt.title('cluster'+str(labelsPlot))
plt.grid(True)
#maxVal = np.apply_along_axis(max, axis=1, arr=matsub )
#print(maxVal)
#a = list([i < thres for i in maxVal ])
#print(a)
#print(enumerate(a))
#aIdx = [i for i, x in enumerate(a) if x]
#print(aIdx)
#if a.any():
# for k in list(clusterSubUn[aIdx]):
# sepClust.append(float(k))
plt.subplots_adjust(
left = 0.125, # the left side of the subplots of the figure
right = 0.9, # the right side of the subplots of the figure
bottom = 0.1, # the bottom of the subplots of the figure
top = 0.9, # the top of the subplots of the figure
wspace = 0.5, # the amount of width reserved for blank space between subplots
hspace = 0.6 ) # the amount of height reserved for white space between subplots
fig2.show()
fig2.savefig('./figures/Dendrograms_subclusters.pdf', bbox_inches='tight')
##====== Get merged subclustering annotation and add it to adata object ======##
dictCells = {}
for i in range(0,len(np.unique(list(adata.obs['cluster'])))):
number = np.unique(list(adata.obs['cluster']))[i]
#print(number)
adataSub = adata[np.array([clust == number for clust in adata.obs['cluster']]),:]
uniqueClust = np.unique(list(adataSub.obs['clusterSubUnique']))
# print(uniqueClust)
listClust = list()
for j in uniqueClust:
dictCells[(str(j))] = (str(int(number))+ str(int(dictNewClust['cluster'+str(j)])))
#print("\n")
dictcells0 = pd.DataFrame.from_dict(dictCells,orient='index',columns=["merged"])
adata.obs['clusterMergedHierarchy'] = list(dictcells0.loc[adata.obs['clusterSubUnique'],'merged'])
##====== Check number of merged subclusters ======##
len(np.unique(adata.obs['clusterMergedHierarchy']))
##====== Write results ======##
sc.write(results_file,adata)
##====== Read adata object ======##
adata = sc.read(results_file)
##====== Compute PAGA on merged subclusters ======##
sc.pp.neighbors(adata,n_neighbors=15,use_rep='X_diffmap')
sc.tl.paga(adata, groups='clusterMergedHierarchy')
##====== Plot tSNE and PAGA coloured by merged subclusters ======##
sc.pl.paga_compare(adata, basis='tsne',threshold_solid=0.25,edge_width_scale=0.1,save="tsne_clustersMerged_AGA.png")
sc.pl.paga_compare(adata, basis='umap',threshold_solid=0.25,edge_width_scale=0.1,save="tsne_clustersMerged_AGA.png")
##====== Plot PAGA coloured by merged subclusters ======##
sc.pl.paga(adata, threshold_solid=0.5,fontsize=7,edge_width_scale=0.1,save="clustersMerged_AGA.png")
##====== Write results ======##
sc.write(results_fileMerged,adata)
##====== Write metadata with merged subclusters ======##
metadataWrite = pd.DataFrame(data=adata.obs)
metadataWrite.to_csv('metadata_mergedSubclusters.txt', sep='\t')